{
 "nbformat": 4,
 "nbformat_minor": 2,
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.7.3-final"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "kernelspec": {
   "name": "python271664bitenvvirtualenv8836d05012704695b9eba33d763507b5",
   "display_name": "Python 2.7.16 64-bit ('env': virtualenv)"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.read_csv(\"data/aggregate_results_env.csv\", index_col=0)\n",
    "df1 = df1[['doi', 'file', 'success']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "7414 2085\n"
    }
   ],
   "source": [
    "# Row count, then number of distinct DOIs, in df1\n",
    "print('{} {}'.format(len(df1), len(df1.doi.unique())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "NaN    3719\n0.0    2223\n1.0    1472\nName: success, dtype: int64"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.success.value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = pd.read_csv(\"data/aggregate_results_no_env.csv\", index_col=0)\n",
    "df2 = df2[['doi', 'file', 'success']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "7659 2071\n"
    }
   ],
   "source": [
    "# Row count, then number of distinct DOIs, in df2\n",
    "print('{} {}'.format(len(df2), len(df2.doi.unique())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "NaN    3829\n0.0    2878\n1.0     952\nName: success, dtype: int64"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2.success.value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfd = df1.merge(df2, on=['doi','file'], how='outer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "8609 2109\n"
    }
   ],
   "source": [
    "# Row count, then number of distinct DOIs, in the outer-merged frame\n",
    "print('{} {}'.format(len(dfd), len(dfd.doi.unique())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_result(r):\n",
    "    if 1.0 in [r.success_x, r.success_y]:\n",
    "        return 1.0\n",
    "    if pd.isnull(r.success_x) or pd.isnull(r.success_y):\n",
    "        return np.nan  \n",
    "    else:\n",
    "        return 0.0\n",
    "\n",
    "dfd['result'] = dfd.apply(get_result, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "NaN    5790\n1.0    1581\n0.0    1238\nName: result, dtype: int64"
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfd.result.value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rfile_stats.csv is read as tab-separated with explicit column names;\n",
    "# only the (doi, file) identifier pair is kept afterwards.\n",
    "df = pd.read_csv(\n",
    "    \"data/rfile_stats.csv\",\n",
    "    delimiter='\\t',\n",
    "    names=['doi', 'file', 'comments_no', 'dep_no', 'func_no',\n",
    "           'test_no', 'class_no', 'encoding', 'total_lines']\n",
    ")\n",
    "\n",
    "df = df.loc[:, ['doi', 'file']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>file</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>doi:10.7910/DVN/XFQZI2</td>\n      <td>Condemnation.R</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>doi:10.7910/DVN/WGPDBS</td>\n      <td>Replication_of_Figures.R</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>doi:10.7910/DVN/BPON3K</td>\n      <td>fig_10_effect_of_winning_on_gov.R</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>doi:10.7910/DVN/BPON3K</td>\n      <td>fig_11_rd_placebo.R</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>doi:10.7910/DVN/BPON3K</td>\n      <td>fig_12_historical_trend.R</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                      doi                               file\n0  doi:10.7910/DVN/XFQZI2                     Condemnation.R\n1  doi:10.7910/DVN/WGPDBS           Replication_of_Figures.R\n2  doi:10.7910/DVN/BPON3K  fig_10_effect_of_winning_on_gov.R\n3  doi:10.7910/DVN/BPON3K                fig_11_rd_placebo.R\n4  doi:10.7910/DVN/BPON3K          fig_12_historical_trend.R"
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# All success files with code cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "8178 7414\n"
    }
   ],
   "source": [
    "print('{} {}'.format(len(df), len(df1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only rows whose DOI also appears in df1, so that DOIs which had a\n",
    "# download error are excluded from df\n",
    "good_dois = df1['doi'].unique()\n",
    "df = df.loc[df['doi'].isin(good_dois)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "7928"
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfm = df.merge(df1, on=['doi','file'], how='outer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "8875"
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dfm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "2085"
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dfm.doi.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>file</th>\n      <th>success</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>doi:10.7910/DVN/XFQZI2</td>\n      <td>Condemnation.R</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>doi:10.7910/DVN/WGPDBS</td>\n      <td>Replication_of_Figures.R</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>doi:10.7910/DVN/BPON3K</td>\n      <td>fig_10_effect_of_winning_on_gov.R</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>doi:10.7910/DVN/BPON3K</td>\n      <td>fig_11_rd_placebo.R</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>doi:10.7910/DVN/BPON3K</td>\n      <td>fig_12_historical_trend.R</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                      doi                               file  success\n0  doi:10.7910/DVN/XFQZI2                     Condemnation.R      NaN\n1  doi:10.7910/DVN/WGPDBS           Replication_of_Figures.R      1.0\n2  doi:10.7910/DVN/BPON3K  fig_10_effect_of_winning_on_gov.R      0.0\n3  doi:10.7910/DVN/BPON3K                fig_11_rd_placebo.R      1.0\n4  doi:10.7910/DVN/BPON3K          fig_12_historical_trend.R      1.0"
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfm.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "NaN    5180\n0.0    2223\n1.0    1472\nName: success, dtype: int64"
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfm.success.value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "2085"
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dfm.doi.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfs = dfm.groupby(['doi'])['success'].value_counts(dropna=False).unstack().fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "2085"
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th>success</th>\n      <th>nan</th>\n      <th>0.0</th>\n      <th>1.0</th>\n    </tr>\n    <tr>\n      <th>doi</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>doi:10.7910/DVN/03CDTK</th>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>doi:10.7910/DVN/05BSPP</th>\n      <td>17.0</td>\n      <td>3.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>doi:10.7910/DVN/0BFF0K</th>\n      <td>0.0</td>\n      <td>1.0</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>doi:10.7910/DVN/0BPVCH</th>\n      <td>3.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>doi:10.7910/DVN/0DE35E</th>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "success                  NaN  0.0  1.0\ndoi                                   \ndoi:10.7910/DVN/03CDTK   2.0  0.0  0.0\ndoi:10.7910/DVN/05BSPP  17.0  3.0  0.0\ndoi:10.7910/DVN/0BFF0K   0.0  1.0  1.0\ndoi:10.7910/DVN/0BPVCH   3.0  0.0  0.0\ndoi:10.7910/DVN/0DE35E   2.0  0.0  1.0"
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfs = dfs.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th>success</th>\n      <th>doi</th>\n      <th>nan</th>\n      <th>0.0</th>\n      <th>1.0</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>doi:10.7910/DVN/03CDTK</td>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>doi:10.7910/DVN/05BSPP</td>\n      <td>17.0</td>\n      <td>3.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>doi:10.7910/DVN/0BFF0K</td>\n      <td>0.0</td>\n      <td>1.0</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>doi:10.7910/DVN/0BPVCH</td>\n      <td>3.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>doi:10.7910/DVN/0DE35E</td>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "success                     doi   NaN  0.0  1.0\n0        doi:10.7910/DVN/03CDTK   2.0  0.0  0.0\n1        doi:10.7910/DVN/05BSPP  17.0  3.0  0.0\n2        doi:10.7910/DVN/0BFF0K   0.0  1.0  1.0\n3        doi:10.7910/DVN/0BPVCH   3.0  0.0  0.0\n4        doi:10.7910/DVN/0DE35E   2.0  0.0  1.0"
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>n</th>\n      <th>f</th>\n      <th>s</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>doi:10.7910/DVN/03CDTK</td>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>doi:10.7910/DVN/05BSPP</td>\n      <td>17.0</td>\n      <td>3.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>doi:10.7910/DVN/0BFF0K</td>\n      <td>0.0</td>\n      <td>1.0</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>doi:10.7910/DVN/0BPVCH</td>\n      <td>3.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>doi:10.7910/DVN/0DE35E</td>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                      doi     n    f    s\n0  doi:10.7910/DVN/03CDTK   2.0  0.0  0.0\n1  doi:10.7910/DVN/05BSPP  17.0  3.0  0.0\n2  doi:10.7910/DVN/0BFF0K   0.0  1.0  1.0\n3  doi:10.7910/DVN/0BPVCH   3.0  0.0  0.0\n4  doi:10.7910/DVN/0DE35E   2.0  0.0  1.0"
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rename the count columns by label rather than by position. The order in\n",
    "# which unstack() emits the NaN / 0.0 / 1.0 labels depends on how pandas sorts\n",
    "# NaN among them, so a positional assignment could silently swap the counts,\n",
    "# and it raises outright when one of the three categories never occurs.\n",
    "# n = count of NaN, f = count of 0.0, s = count of 1.0 in `success` per DOI.\n",
    "count_labels = {0.0: 'f', 1.0: 's'}\n",
    "dfs = dfs.rename(columns=lambda c: 'n' if pd.isnull(c) else count_labels.get(c, c))\n",
    "# Fix the column order and add an all-zero column for any category that is absent\n",
    "dfs = dfs.reindex(columns=['doi', 'n', 'f', 's'], fill_value=0.0)\n",
    "# Drop the leftover 'success' axis label, as the positional assignment did\n",
    "dfs.columns.name = None\n",
    "dfs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def final(n, s, f):\n",
    "    a = '0'\n",
    "    b = '0'\n",
    "    c = '0'\n",
    "    if n > 0:\n",
    "        a = '1'\n",
    "    if s > 0:\n",
    "        b = '1'\n",
    "    if f > 0:\n",
    "        c = '1'\n",
    "    return a+b+c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfs['final'] = dfs.apply(lambda x: final(x.n, x.s, x.f), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "001    655\n100    638\n110    235\n011    168\n101    144\n010    129\n111    116\nName: final, dtype: int64"
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs.final.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "5180.0"
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs.n.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "1472.0"
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs.s.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "2223.0"
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs.f.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>n</th>\n      <th>f</th>\n      <th>s</th>\n      <th>final</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>doi:10.7910/DVN/03CDTK</td>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>100</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>doi:10.7910/DVN/05BSPP</td>\n      <td>17.0</td>\n      <td>3.0</td>\n      <td>0.0</td>\n      <td>101</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>doi:10.7910/DVN/0BFF0K</td>\n      <td>0.0</td>\n      <td>1.0</td>\n      <td>1.0</td>\n      <td>011</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>doi:10.7910/DVN/0BPVCH</td>\n      <td>3.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>100</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>doi:10.7910/DVN/0DE35E</td>\n      <td>2.0</td>\n      <td>0.0</td>\n      <td>1.0</td>\n      <td>110</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                      doi     n    f    s final\n0  doi:10.7910/DVN/03CDTK   2.0  0.0  0.0   100\n1  doi:10.7910/DVN/05BSPP  17.0  3.0  0.0   101\n2  doi:10.7910/DVN/0BFF0K   0.0  1.0  1.0   011\n3  doi:10.7910/DVN/0BPVCH   3.0  0.0  0.0   100\n4  doi:10.7910/DVN/0DE35E   2.0  0.0  1.0   110"
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfs['count']=dfs.n+dfs.f+dfs.s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>n</th>\n      <th>f</th>\n      <th>s</th>\n      <th>final</th>\n      <th>count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>679</th>\n      <td>doi:10.7910/DVN/FMJDCD</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>3.0</td>\n      <td>010</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1132</th>\n      <td>doi:10.7910/DVN/M7FYU8</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>3.0</td>\n      <td>010</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1607</th>\n      <td>doi:10.7910/DVN/SWV9GJ</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>3.0</td>\n      <td>010</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1781</th>\n      <td>doi:10.7910/DVN/VJTPJK</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>3.0</td>\n      <td>010</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1794</th>\n      <td>doi:10.7910/DVN/VSIAGW</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>3.0</td>\n      <td>010</td>\n      <td>3.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                         doi    n    f    s final  count\n679   doi:10.7910/DVN/FMJDCD  0.0  0.0  3.0   010    3.0\n1132  doi:10.7910/DVN/M7FYU8  0.0  0.0  3.0   010    3.0\n1607  doi:10.7910/DVN/SWV9GJ  0.0  0.0  3.0   010    3.0\n1781  doi:10.7910/DVN/VJTPJK  0.0  0.0  3.0   010    3.0\n1794  doi:10.7910/DVN/VSIAGW  0.0  0.0  3.0   010    3.0"
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs[(dfs.final=='010') & (dfs['count'] == 3.0)].tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get a sample for RQ 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 304,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>n</th>\n      <th>f</th>\n      <th>s</th>\n      <th>final</th>\n      <th>count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2023</th>\n      <td>doi:10.7910/DVN/Z02C8Y</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>1.0</td>\n      <td>010</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>787</th>\n      <td>doi:10.7910/DVN/H11ITR</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>2.0</td>\n      <td>010</td>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>2010</th>\n      <td>doi:10.7910/DVN/YT45AO</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>2.0</td>\n      <td>010</td>\n      <td>2.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                         doi    n    f    s final  count\n2023  doi:10.7910/DVN/Z02C8Y  0.0  0.0  1.0   010    1.0\n787   doi:10.7910/DVN/H11ITR  0.0  0.0  2.0   010    2.0\n2010  doi:10.7910/DVN/YT45AO  0.0  0.0  2.0   010    2.0"
     },
     "execution_count": 304,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fixed seed so that re-running the notebook draws the same three DOIs;\n",
    "# without it every execution of this cell returns a different sample.\n",
    "dfs[dfs.final == '010'].sample(n=3, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>file</th>\n      <th>success</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1989</th>\n      <td>doi:10.7910/DVN/Z02C8Y</td>\n      <td>produce correl_plot.R</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                         doi                   file  success\n1989  doi:10.7910/DVN/Z02C8Y  produce correl_plot.R      1.0"
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1[df1.doi=='doi:10.7910/DVN/Z02C8Y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>file</th>\n      <th>success</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>3527</th>\n      <td>doi:10.7910/DVN/H11ITR</td>\n      <td>Modsel.R</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>3528</th>\n      <td>doi:10.7910/DVN/H11ITR</td>\n      <td>Vuong_R_Replication.r</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                         doi                   file  success\n3527  doi:10.7910/DVN/H11ITR               Modsel.R      1.0\n3528  doi:10.7910/DVN/H11ITR  Vuong_R_Replication.r      1.0"
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1[df1.doi=='doi:10.7910/DVN/H11ITR']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>file</th>\n      <th>r32</th>\n      <th>r36</th>\n      <th>r40</th>\n      <th>result</th>\n      <th>success</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>808</th>\n      <td>doi:10.7910/DVN/YT45AO</td>\n      <td>bound_fn.R</td>\n      <td>success</td>\n      <td>success</td>\n      <td>success</td>\n      <td>success</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>809</th>\n      <td>doi:10.7910/DVN/YT45AO</td>\n      <td>bound_main.R</td>\n      <td>Error in eval(expr, envir, enclos) : could not...</td>\n      <td>success</td>\n      <td>success</td>\n      <td>success</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                        doi          file  \\\n808  doi:10.7910/DVN/YT45AO    bound_fn.R   \n809  doi:10.7910/DVN/YT45AO  bound_main.R   \n\n                                                   r32      r36      r40  \\\n808                                            success  success  success   \n809  Error in eval(expr, envir, enclos) : could not...  success  success   \n\n      result  success  \n808  success      1.0  \n809  success      1.0  "
     },
     "execution_count": 311,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1[df1.doi=='doi:10.7910/DVN/YT45AO']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 317,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>file</th>\n      <th>r32</th>\n      <th>r36</th>\n      <th>r40</th>\n      <th>result</th>\n      <th>success</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1077</th>\n      <td>doi:10.7910/DVN/SWV9GJ</td>\n      <td>Add Study 1 - analysis code.R</td>\n      <td>Error in file(file, 'rt') : cannot open the co...</td>\n      <td>Error in file(file, 'rt') : cannot open the co...</td>\n      <td>success</td>\n      <td>success</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>1078</th>\n      <td>doi:10.7910/DVN/SWV9GJ</td>\n      <td>Add Study 2 - analysis code.R</td>\n      <td>Error in file(file, 'rt') : cannot open the co...</td>\n      <td>Error in file(file, 'rt') : cannot open the co...</td>\n      <td>success</td>\n      <td>success</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>5601</th>\n      <td>doi:10.7910/DVN/SWV9GJ</td>\n      <td>TESS_analysis_code.R</td>\n      <td>NaN</td>\n      <td>Error in alpha(sjt) : could not find function ...</td>\n      <td>success</td>\n      <td>success</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                         doi                           file  \\\n1077  doi:10.7910/DVN/SWV9GJ  Add Study 1 - analysis code.R   \n1078  doi:10.7910/DVN/SWV9GJ  Add Study 2 - analysis code.R   \n5601  doi:10.7910/DVN/SWV9GJ           TESS_analysis_code.R   \n\n                                                    r32  \\\n1077  Error in file(file, 'rt') : cannot open the co...   \n1078  Error in file(file, 'rt') : cannot open the co...   \n5601                                                NaN   \n\n                                                    r36      r40   result  \\\n1077  Error in file(file, 'rt') : cannot open the co...  success  success   \n1078  Error in file(file, 'rt') : cannot open the co...  success  success   \n5601  Error in alpha(sjt) : could not find function ...  success  success   \n\n      success  \n1077      1.0  \n1078      1.0  \n5601      1.0  "
     },
     "execution_count": 317,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1[df1.doi=='doi:10.7910/DVN/SWV9GJ']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}